{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "import pandas as pd\n",
    "import pickle as cPickle\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "from numpy.random import random  \n",
    "from collections import defaultdict\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  推荐系统类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "class RecommonderSystem:\n",
    "  def __init__(self):\n",
    "    # 读入数据做初始化\n",
    "    \n",
    "    #用户和活动新的索引\n",
    "    self.userIndex = cPickle.load(open(\"Data/PE_userIndex.pkl\", 'rb'))\n",
    "    self.eventIndex = cPickle.load(open(\"Data/PE_eventIndex.pkl\", 'rb'))\n",
    "    self.n_users = len(self.userIndex)\n",
    "    self.n_items = len(self.eventIndex)\n",
    "    \n",
    "    #用户-活动关系矩阵R\n",
    "    self.userEventScores = sio.mmread(\"Data/PE_userEventScores\").todense()\n",
    "    \n",
    "    #倒排表\n",
    "    ##每个用户参加的事件\n",
    "    self.itemsForUser = cPickle.load(open(\"Data/PE_eventsForUser.pkl\", 'rb'))\n",
    "    ##事件参加的用户\n",
    "    self.usersForItem = cPickle.load(open(\"Data/PE_usersForEvent.pkl\", 'rb'))\n",
    "    \n",
    "    #基于模型的协同过滤参数初始化,训练\n",
    "    self.init_SVD()\n",
    "    self.train_SVD(trainfile = \"Data/train.csv\")\n",
    "    \n",
    "    #根据用户属性计算出的用户之间的相似度\n",
    "    self.userSimMatrix = sio.mmread(\"Data/US_userSimMatrix\").todense()\n",
    "    \n",
    "    #根据活动属性计算出的活动之间的相似度\n",
    "    self.eventPropSim = sio.mmread(\"Data/EV_eventPropSim\").todense()\n",
    "    self.eventContSim = sio.mmread(\"Data/EV_eventContSim\").todense()\n",
    "    \n",
    "    #每个用户的朋友的数目\n",
    "    self.numFriends = sio.mmread(\"Data/UF_numFriends\")\n",
    "    #用户的每个朋友参加活动的分数对该用户的影响\n",
    "    self.userFriends = sio.mmread(\"Data/UF_userFriends\").todense()\n",
    "    \n",
    "    #活动本身的热度\n",
    "    self.eventPopularity = sio.mmread(\"Data/EA_eventPopularity\").todense()\n",
    "\n",
    "  def init_SVD(self, K=20):\n",
    "    #初始化模型参数（for 基于模型的协同过滤SVD_CF）\n",
    "    self.K = K  \n",
    "    \n",
    "    #init parameters\n",
    "    #bias\n",
    "    self.bi = np.zeros(self.n_items)  \n",
    "    self.bu = np.zeros(self.n_users)  \n",
    "    \n",
    "    #the small matrix\n",
    "    self.P = random((self.n_users,self.K))/10*(np.sqrt(self.K))\n",
    "    self.Q = random((self.K, self.n_items))/10*(np.sqrt(self.K))  \n",
    "                  \n",
    "          \n",
    "  def train_SVD(self,trainfile = 'Data/train.csv', steps=100,gamma=0.04,Lambda=0.15):\n",
    "    #训练SVD模型（for 基于模型的协同过滤SVD_CF）\n",
    "    #gamma：为学习率\n",
    "    #Lambda：正则参数\n",
    "    \n",
    "    print (\"SVD Train...\")\n",
    "    ftrain = open(trainfile, 'r')\n",
    "    ftrain.readline()\n",
    "    self.mu = 0.0\n",
    "    n_records = 0\n",
    "    uids = []  #每条记录的用户索引\n",
    "    i_ids = [] #每条记录的item索引\n",
    "    #用户-Item关系矩阵R（内容同userEventScores相同），临时变量，训练完了R不再需要\n",
    "    R = np.zeros((self.n_users, self.n_items))\n",
    "    \n",
    "    for line in ftrain:\n",
    "        cols = line.strip().split(\",\")\n",
    "        u = self.userIndex[cols[0]]  #用户\n",
    "        i = self.eventIndex[cols[1]] #活动\n",
    "        \n",
    "        uids.append(u)\n",
    "        i_ids.append(i)\n",
    "        \n",
    "        R[u,i] = int(cols[4])  #interested\n",
    "        self.mu += R[u,i] # mu 是整个评分的一个平均数\n",
    "        n_records += 1\n",
    "    \n",
    "    ftrain.close()\n",
    "    self.mu /= n_records\n",
    "    \n",
    "    # code begin \n",
    "    \n",
    "    for step in range(steps):  \n",
    "        rmse_sum=0.0 \n",
    "\n",
    "        # 随机打乱样本顺序\n",
    "        kk = np.random.permutation(n_records)  \n",
    "        for j in range(n_records):  \n",
    "            \n",
    "            u = uids[kk[j]]\n",
    "            i = i_ids[kk[j]]\n",
    "            # 残差计算    \n",
    "            re_ui = R[u,i] - self.pred_SVD(u,i) - self.bu[u] - self.bi[i]- np.dot(self.P[u,:],self.Q[:,i])\n",
    "            re_ui= 1 if re_ui>1 else 0\n",
    "            # 残差平方和\n",
    "            rmse_sum += re_ui **2\n",
    "                            \n",
    "            #SGD\n",
    "            self.bu[u] += gamma * (re_ui - Lambda * self.bu[u])  \n",
    "            self.bi[i] += gamma * (re_ui - Lambda * self.bi[i]) \n",
    "            # 更新 P Q\n",
    "            for k in range(self.K):\n",
    "                self.P[u,k] += gamma * re_ui * self.Q[k,i] - Lambda * self.P[u,k]\n",
    "                self.Q[k,i] += gamma * re_ui * self.P[u,k] - Lambda * self.Q[k,i]\n",
    "        \n",
    "        rmse_sum = 0.5 * rmse_sum\n",
    "        \n",
    "        pmatrix_sum = 0.5 * Lambda * self.P[u,:].dot(self.P[u,:].T)\n",
    "        qmatrix_sum = 0.5 * Lambda * self.Q[:,i].dot(self.Q[:,i].T)\n",
    "        \n",
    "        # 偏置\n",
    "        b_u_sum = 0.5 * Lambda* (self.bu**2).sum()\n",
    "        b_i_sum =0.5 * Lambda* (self.bi**2).sum()\n",
    "\n",
    "        # 目标函数\n",
    "        sse = rmse_sum + ( pmatrix_sum + qmatrix_sum )+ (b_u_sum + b_i_sum)\n",
    "        if sse < 0.001:\n",
    "            print (\"the sse is\" +sse)\n",
    "        break\n",
    "        \n",
    "        delta = 0.95 \n",
    "        # 学习率递减\n",
    "        gamma = gamma*delta  \n",
    "    \n",
    "    # code end\n",
    "    \n",
    "    # 请补充完整SVD模型训练过程\n",
    "    print (\"SVD trained\")\n",
    "    \n",
    "  def pred_SVD(self, uid, i_id):\n",
    "    #根据当前参数，预测用户uid对Item（i_id）的打分        \n",
    "    ans=self.mu + self.bi[i_id] + self.bu[uid] + np.dot(self.P[uid,:],self.Q[:,i_id])  \n",
    "        \n",
    "    #将打分范围控制在0-1之间\n",
    "    if ans>1:  \n",
    "        return 1  \n",
    "    elif ans<0:  \n",
    "        return 0\n",
    "    return ans  \n",
    "\n",
    "  def sim_cal_UserCF(self, uid1, uid2 ):\n",
    "    #请补充基于用户的协同过滤中的两个用户uid1和uid2之间的相似度\n",
    "    \n",
    "    # code begin\n",
    "    both_i = {}  # 两个用户均有打分的item\n",
    "    \n",
    "    for item in self.itemsForUser[uid1]:  # uid1所有打过分的item1\n",
    "        if item in self.itemsForUser[uid2]:  # 如果uid2也对该item打过分\n",
    "            both_i[item]=1  \n",
    "        \n",
    "    n= len(both_i)   \n",
    "    if (n==0):  \n",
    "        return 0  \n",
    "        \n",
    "    # uid1打过分的有效item\n",
    "    s1= np.array([self.userEventScores[uid1,item] for item in both_i])  \n",
    "        \n",
    "    # uid2打过分的有效item\n",
    "    s2= np.array([self.userEventScores[uid2,item] for item in both_i])  \n",
    "        \n",
    "    sum1= np.sum(s1)  \n",
    "    sum2= np.sum(s2)  \n",
    "    sum1Sq= np.sum(s1**2)  \n",
    "    sum2Sq= np.sum(s2**2)  \n",
    "    pSum= np.sum(s1*s2)  \n",
    "        \n",
    "    num= pSum-(sum1*sum2 / n)  \n",
    "        \n",
    "    den= np.sqrt((sum1Sq-sum1**2 / n) * (sum2Sq-sum2**2 / n))  \n",
    "    if den==0:  \n",
    "        return 0  \n",
    "        \n",
    "    similarity = num/den  \n",
    "    return similarity   \n",
    "    # code end\n",
    "    \n",
    "\n",
    "  def userCFReco(self, userId, eventId):\n",
    "    \"\"\"\n",
    "    根据User-based协同过滤，得到event的推荐度\n",
    "    基本的伪代码思路如下：\n",
    "    for item i\n",
    "      for every other user v that has a preference for i\n",
    "        compute similarity s between u and v\n",
    "        incorporate v's preference for i weighted by s into running average\n",
    "    return top items ranked by weighted average\n",
    "    \"\"\"\n",
    "    #请补充完整代码\n",
    "    \n",
    "    # code begin\n",
    "    \n",
    "    u = self.userIndex[userId]\n",
    "    i = self.eventIndex[eventId]\n",
    "    \n",
    "    sim_accumulate=0.0  \n",
    "    rat_acc=0.0  \n",
    "    \n",
    "    for user in self.usersForItem[i]:  #对eventId打过分的所有用户\n",
    "        \n",
    "        sim = self.sim_cal_UserCF(uid1 = user,uid2 = u)    #该user与uid之间的相似度\n",
    "        if sim == 0:continue  \n",
    "            \n",
    "        rat_acc += sim * self.userEventScores[user,i]   #用户user对eventId的打分\n",
    "        \n",
    "        sim_accumulate += sim  \n",
    "        \n",
    "    if sim_accumulate==0: \n",
    "        return  self.mu  \n",
    "    ans = rat_acc/sim_accumulate  \n",
    "\n",
    "    #打分控制在0-1间\n",
    "    if ans>1:  \n",
    "        return 1  \n",
    "    elif ans<0:  \n",
    "        return 0  \n",
    "    return ans\n",
    "\n",
    "    # code end\n",
    "    \n",
    "\n",
    "  def sim_cal_ItemCF(self, i_id1, i_id2):\n",
    "    #计算Item i_id1和i_id2之间的相似性\n",
    "    \n",
    "    # code begin\n",
    "    \n",
    "    both_u= {}  # 有效用户集合\n",
    "    for user in self.usersForItem[i_id1]:  # 所有对item1打过分的的user\n",
    "        if user in self.usersForItem[i_id2]:  # 如果该用户对item2也打过分\n",
    "            both_u[user]=1  # 有效用用户\n",
    "        \n",
    "    n= len(both_u)  \n",
    "    if (n==0):  \n",
    "        return 0  \n",
    "        \n",
    "    s1= np.array([self.userEventScores[u, i_id1] for u in both_u])  \n",
    "        \n",
    "    s2= np.array([self.userEventScores[u, i_id2] for u in both_u])  \n",
    "        \n",
    "    sum1= np.sum(s1)  \n",
    "    sum2= np.sum(s2)  \n",
    "    sum1Sq= np.sum(s1**2)  \n",
    "    sum2Sq= np.sum(s2**2)  \n",
    "    pSum= np.sum(s1*s2)  \n",
    "        \n",
    "    num= pSum-(sum1*sum2/n)  \n",
    "        \n",
    "    den= np.sqrt((sum1Sq-sum1**2 / n) * (sum2Sq-sum2**2 / n))  \n",
    "    if den==0:  \n",
    "        return 0  \n",
    "        \n",
    "    return num/den \n",
    "    \n",
    "    # code end\n",
    "\n",
    "            \n",
    "  def eventCFReco(self, userId, eventId):    \n",
    "    \"\"\"\n",
    "    根据基于物品的协同过滤，得到Event的推荐度\n",
    "    基本的伪代码思路如下：\n",
    "    for item i \n",
    "        for every item j tht u has a preference for\n",
    "            compute similarity s between i and j\n",
    "            add u's preference for j weighted by s to a running average\n",
    "    return top items, ranked by weighted average\n",
    "    \"\"\"\n",
    "    #请补充完整代码\n",
    "    ans = 0.0\n",
    "    \n",
    "    # code begin\n",
    "    \n",
    "    u = self.userIndex[userId]\n",
    "    i = self.eventIndex[eventId]\n",
    "\n",
    "    sim_accumulate=0.0  \n",
    "    rat_acc=0.0  \n",
    "                   \n",
    "    for item in self.itemsForUser[u]:  # 用户uid打过分的所有item\n",
    "        sim = self.sim_cal_ItemCF(item,i)    #该item与i之间的相似度\n",
    "           \n",
    "        rat_acc += sim * self.userEventScores[u,item]  \n",
    "        sim_accumulate += sim  \n",
    "        \n",
    "    if sim_accumulate==0: \n",
    "        return  self.mu  \n",
    "\n",
    "    ans = rat_acc/sim_accumulate  \n",
    "\n",
    "    # 打分范围控制在 0-1\n",
    "    if ans>1:  \n",
    "        return 1  \n",
    "    elif ans<0:  \n",
    "        return 0\n",
    "    return ans\n",
    "    \n",
    "    # code end\n",
    "    \n",
    "  def svdCFReco(self, userId, eventId):\n",
    "    #基于模型的协同过滤, SVD++/LFM\n",
    "    u = self.userIndex[userId]\n",
    "    i = self.eventIndex[eventId]\n",
    "\n",
    "    return self.pred_SVD(u,i)\n",
    "\n",
    "  def userReco(self, userId, eventId):\n",
    "    \"\"\"\n",
    "    类似基于User-based协同过滤，只是用户之间的相似度由用户本身的属性得到，计算event的推荐度\n",
    "    基本的伪代码思路如下：\n",
    "    for item i\n",
    "      for every other user v that has a preference for i\n",
    "        compute similarity s between u and v\n",
    "        incorporate v's preference for i weighted by s into running aversge\n",
    "    return top items ranked by weighted average\n",
    "    \"\"\"\n",
    "    i = self.userIndex[userId]\n",
    "    j = self.eventIndex[eventId]\n",
    "\n",
    "    vs = self.userEventScores[:, j]\n",
    "    sims = self.userSimMatrix[i, :]\n",
    "\n",
    "    prod = sims * vs\n",
    "\n",
    "    try:\n",
    "      return prod[0, 0] - self.userEventScores[i, j]\n",
    "    except IndexError:\n",
    "      return 0\n",
    "\n",
    "  def eventReco(self, userId, eventId):\n",
    "    \"\"\"\n",
    "    类似基于Item-based协同过滤，只是item之间的相似度由item本身的属性得到，计算Event的推荐度\n",
    "    基本的伪代码思路如下：\n",
    "    for item i \n",
    "      for every item j that u has a preference for\n",
    "        compute similarity s between i and j\n",
    "        add u's preference for j weighted by s to a running average\n",
    "    return top items, ranked by weighted average\n",
    "    \"\"\"\n",
    "    i = self.userIndex[userId]\n",
    "    j = self.eventIndex[eventId]\n",
    "    js = self.userEventScores[i, :]\n",
    "    psim = self.eventPropSim[:, j]\n",
    "    csim = self.eventContSim[:, j]\n",
    "    pprod = js * psim\n",
    "    cprod = js * csim\n",
    "    \n",
    "    pscore = 0\n",
    "    cscore = 0\n",
    "    try:\n",
    "      pscore = pprod[0, 0] - self.userEventScores[i, j]\n",
    "    except IndexError:\n",
    "      pass\n",
    "    try:\n",
    "      cscore = cprod[0, 0] - self.userEventScores[i, j]\n",
    "    except IndexError:\n",
    "      pass\n",
    "    return pscore, cscore\n",
    "\n",
    "  def userPop(self, userId):\n",
    "    \"\"\"\n",
    "    基于用户的朋友个数来推断用户的社交程度\n",
    "    主要的考量是如果用户的朋友非常多，可能会更倾向于参加各种社交活动\n",
    "    \"\"\"\n",
    "    if userId in self.userIndex:\n",
    "      i = self.userIndex[userId]\n",
    "      try:\n",
    "        return self.numFriends[0, i]\n",
    "      except IndexError:\n",
    "        return 0\n",
    "    else:\n",
    "      return 0\n",
    "\n",
    "  def friendInfluence(self, userId):\n",
    "    \"\"\"\n",
    "    朋友对用户的影响\n",
    "    主要考虑用户所有的朋友中，有多少是非常喜欢参加各种社交活动/event的\n",
    "    用户的朋友圈如果都积极参与各种event，可能会对当前用户有一定的影响\n",
    "    \"\"\"\n",
    "    nusers = np.shape(self.userFriends)[1]\n",
    "    i = self.userIndex[userId]\n",
    "    return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]\n",
    "\n",
    "  def eventPop(self, eventId):\n",
    "    \"\"\"\n",
    "    本活动本身的热度\n",
    "    主要是通过参与的人数来界定的\n",
    "    \"\"\"\n",
    "    i = self.eventIndex[eventId]\n",
    "    return self.eventPopularity[i, 0]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 生成推荐数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generateRSData(RS, train=True, header=True):\n",
    "    \"\"\"\n",
    "    把前面user-based协同过滤 和 item-based协同过滤，以及各种热度和影响度作为特征组合在一起\n",
    "    生成新的训练数据，用于分类器分类使用\n",
    "    \"\"\"\n",
    "    fn = \"train.csv\" if train else \"test.csv\"\n",
    "    fin = open(fn, 'rb')\n",
    "    fout = open(\"RS_\" + fn, 'w')\n",
    "    \n",
    "    #忽略第一行（列名字）\n",
    "    fin.readline().decode().strip().split(\",\")\n",
    "    \n",
    "    # write output header\n",
    "    if header:\n",
    "      ocolnames = [\"invited\", \"userCF_reco\", \"evtCF_reco\",\"svdCF_reco\",\"user_reco\", \"evt_p_reco\",\n",
    "        \"evt_c_reco\", \"user_pop\", \"frnd_infl\", \"evt_pop\"]\n",
    "      if train:\n",
    "        ocolnames.append(\"interested\")\n",
    "        ocolnames.append(\"not_interested\")\n",
    "      fout.write(\",\".join(ocolnames) + \"\\n\")\n",
    "    \n",
    "    ln = 0\n",
    "    for line in fin:\n",
    "      ln += 1\n",
    "      if ln%500 == 0:\n",
    "          print (\"%s:%d (userId, eventId)=(%s, %s)\" % (fn, ln, userId, eventId))\n",
    "          #break\n",
    "      \n",
    "      cols = line.decode().strip().split(\",\")\n",
    "      userId = cols[0]\n",
    "      eventId = cols[1]\n",
    "      invited = cols[2]\n",
    "      \n",
    "      userCF_reco = RS.userCFReco(userId, eventId)\n",
    "      itemCF_reco = RS.eventCFReco(userId, eventId)\n",
    "      svdCF_reco = RS.svdCFReco(userId, eventId)\n",
    "        \n",
    "      user_reco = RS.userReco(userId, eventId)\n",
    "      evt_p_reco, evt_c_reco = RS.eventReco(userId, eventId)\n",
    "      user_pop = RS.userPop(userId)\n",
    "     \n",
    "      frnd_infl = RS.friendInfluence(userId)\n",
    "      evt_pop = RS.eventPop(eventId)\n",
    "      ocols = [invited, userCF_reco, itemCF_reco, svdCF_reco,user_reco, evt_p_reco,\n",
    "        evt_c_reco, user_pop, frnd_infl, evt_pop]\n",
    "      \n",
    "      if train:\n",
    "        ocols.append(cols[4]) # interested\n",
    "        ocols.append(cols[5]) # not_interested\n",
    "      fout.write(\",\".join(map(lambda x: str(x), ocols)) + \"\\n\")\n",
    "    \n",
    "    fin.close()\n",
    "    fout.close()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 实例化推荐系统类,生成推荐数据文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SVD Train...\n",
      "SVD trained\n",
      "生成训练数据...\n",
      "\n",
      "train.csv:500 (userId, eventId)=(123290209, 1887085024)\n",
      "train.csv:1000 (userId, eventId)=(272886293, 199858305)\n",
      "train.csv:1500 (userId, eventId)=(395305791, 1582270949)\n",
      "train.csv:2000 (userId, eventId)=(527523423, 3272728211)\n",
      "train.csv:2500 (userId, eventId)=(651258472, 792632006)\n",
      "train.csv:3000 (userId, eventId)=(811791433, 524756826)\n",
      "train.csv:3500 (userId, eventId)=(985547042, 1269035551)\n",
      "train.csv:4000 (userId, eventId)=(1107615001, 173949238)\n",
      "train.csv:4500 (userId, eventId)=(1236336671, 3849306291)\n",
      "train.csv:5000 (userId, eventId)=(1414301782, 2652356640)\n",
      "train.csv:5500 (userId, eventId)=(1595465532, 955398943)\n",
      "train.csv:6000 (userId, eventId)=(1747091728, 2131379889)\n",
      "train.csv:6500 (userId, eventId)=(1914182220, 955398943)\n",
      "train.csv:7000 (userId, eventId)=(2071842684, 1076364848)\n",
      "train.csv:7500 (userId, eventId)=(2217853337, 3051438735)\n",
      "train.csv:8000 (userId, eventId)=(2338481531, 2525447278)\n",
      "train.csv:8500 (userId, eventId)=(2489551967, 520657921)\n",
      "train.csv:9000 (userId, eventId)=(2650493630, 87962584)\n",
      "train.csv:9500 (userId, eventId)=(2791418962, 4223848259)\n",
      "train.csv:10000 (userId, eventId)=(2903662804, 2791462807)\n",
      "train.csv:10500 (userId, eventId)=(3036141956, 3929507420)\n",
      "train.csv:11000 (userId, eventId)=(3176074542, 3459485614)\n",
      "train.csv:11500 (userId, eventId)=(3285425249, 2271782630)\n",
      "train.csv:12000 (userId, eventId)=(3410667855, 1063772489)\n",
      "train.csv:12500 (userId, eventId)=(3531604778, 2584839423)\n",
      "train.csv:13000 (userId, eventId)=(3686871863, 53495098)\n",
      "train.csv:13500 (userId, eventId)=(3833637800, 2415873572)\n",
      "train.csv:14000 (userId, eventId)=(3944021305, 2096772901)\n",
      "train.csv:14500 (userId, eventId)=(4075466480, 3567240505)\n",
      "train.csv:15000 (userId, eventId)=(4197193550, 1628057176)\n",
      "生成测试数据...\n",
      "\n",
      "test.csv:500 (userId, eventId)=(182290053, 2529072432)\n",
      "test.csv:1000 (userId, eventId)=(433510318, 4244463632)\n",
      "test.csv:1500 (userId, eventId)=(632808865, 2845303452)\n",
      "test.csv:2000 (userId, eventId)=(813611885, 2036538169)\n",
      "test.csv:2500 (userId, eventId)=(1010701404, 303459881)\n",
      "test.csv:3000 (userId, eventId)=(1210932037, 2529072432)\n",
      "test.csv:3500 (userId, eventId)=(1452921099, 2705317682)\n",
      "test.csv:4000 (userId, eventId)=(1623287180, 1626678328)\n",
      "test.csv:4500 (userId, eventId)=(1855201342, 2603032829)\n",
      "test.csv:5000 (userId, eventId)=(2083900381, 2529072432)\n",
      "test.csv:5500 (userId, eventId)=(2318415276, 2509151803)\n",
      "test.csv:6000 (userId, eventId)=(2528161539, 4025975316)\n",
      "test.csv:6500 (userId, eventId)=(2749110768, 4244406355)\n",
      "test.csv:7000 (userId, eventId)=(2927772127, 1532377761)\n",
      "test.csv:7500 (userId, eventId)=(3199685636, 1776393554)\n",
      "test.csv:8000 (userId, eventId)=(3393388475, 680270887)\n",
      "test.csv:8500 (userId, eventId)=(3601169721, 154434302)\n",
      "test.csv:9000 (userId, eventId)=(3828963415, 3067222491)\n",
      "test.csv:9500 (userId, eventId)=(4018723397, 2522610844)\n",
      "test.csv:10000 (userId, eventId)=(4180064266, 2658555390)\n"
     ]
    }
   ],
   "source": [
    "RS = RecommonderSystem()\n",
    "print (\"生成训练数据...\\n\")\n",
    "generateRSData(RS,train=True,  header=True)\n",
    "print (\"生成测试数据...\\n\")\n",
    "generateRSData(RS, train=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>invited</th>\n",
       "      <th>userCF_reco</th>\n",
       "      <th>evtCF_reco</th>\n",
       "      <th>svdCF_reco</th>\n",
       "      <th>user_reco</th>\n",
       "      <th>evt_p_reco</th>\n",
       "      <th>evt_c_reco</th>\n",
       "      <th>user_pop</th>\n",
       "      <th>frnd_infl</th>\n",
       "      <th>evt_pop</th>\n",
       "      <th>interested</th>\n",
       "      <th>not_interested</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.488980</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.059746e+00</td>\n",
       "      <td>0.971727</td>\n",
       "      <td>0.000231</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.441393</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.820462e-01</td>\n",
       "      <td>0.551649</td>\n",
       "      <td>0.000231</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.000012</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>1.667252</td>\n",
       "      <td>-1.000000e+00</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.000231</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.000018</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.567770</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.199822e+00</td>\n",
       "      <td>0.813238</td>\n",
       "      <td>0.000231</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000032</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.179932</td>\n",
       "      <td>2.258463e-07</td>\n",
       "      <td>0.977072</td>\n",
       "      <td>0.000231</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000039</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.000025</td>\n",
       "      <td>2.032554e-07</td>\n",
       "      <td>0.977072</td>\n",
       "      <td>0.000231</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.000163</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.337757</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.923380e+00</td>\n",
       "      <td>2.989740</td>\n",
       "      <td>0.000160</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.000005</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.270455</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.927441e-01</td>\n",
       "      <td>0.896462</td>\n",
       "      <td>0.000160</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000044</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.270662</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-4.044681e-01</td>\n",
       "      <td>0.924551</td>\n",
       "      <td>0.000160</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000064</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   invited  userCF_reco  evtCF_reco  svdCF_reco  user_reco    evt_p_reco  \\\n",
       "0        0     0.268282    0.268282    0.488980   0.000000  1.059746e+00   \n",
       "1        0     0.268282    0.268282    0.441393   0.000000  1.820462e-01   \n",
       "2        0     0.268282    0.268282    0.268282   1.667252 -1.000000e+00   \n",
       "3        0     0.268282    0.268282    0.567770   0.000000  1.199822e+00   \n",
       "4        0     0.268282    0.268282    0.268282   0.179932  2.258463e-07   \n",
       "5        0     0.268282    0.268282    0.268282   0.000025  2.032554e-07   \n",
       "6        0     0.268282    0.268282    0.337757   0.000000  1.923380e+00   \n",
       "7        0     0.268282    0.268282    0.270455   0.000000  1.927441e-01   \n",
       "8        0     0.268282    0.268282    0.270662   0.000000 -4.044681e-01   \n",
       "\n",
       "   evt_c_reco  user_pop  frnd_infl   evt_pop  interested  not_interested  \n",
       "0    0.971727  0.000231        0.0  0.000000           0               0  \n",
       "1    0.551649  0.000231        0.0 -0.000012           0               0  \n",
       "2   -1.000000  0.000231        0.0 -0.000018           1               0  \n",
       "3    0.813238  0.000231        0.0  0.000032           0               0  \n",
       "4    0.977072  0.000231        0.0  0.000039           0               0  \n",
       "5    0.977072  0.000231        0.0 -0.000163           0               0  \n",
       "6    2.989740  0.000160        0.0 -0.000005           0               0  \n",
       "7    0.896462  0.000160        0.0  0.000044           1               0  \n",
       "8    0.924551  0.000160        0.0  0.000064           1               0  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rs_train = pd.read_csv(\"RS_train.csv\")\n",
    "rs_train.head(9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>invited</th>\n",
       "      <th>userCF_reco</th>\n",
       "      <th>evtCF_reco</th>\n",
       "      <th>svdCF_reco</th>\n",
       "      <th>user_reco</th>\n",
       "      <th>evt_p_reco</th>\n",
       "      <th>evt_c_reco</th>\n",
       "      <th>user_pop</th>\n",
       "      <th>frnd_infl</th>\n",
       "      <th>evt_pop</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000118</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000118</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000118</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.000002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000118</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.881089</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000118</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.000030</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000118</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000253</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000118</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-0.000014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.860794</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000009</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000037</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>0.268282</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000009</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000028</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   invited  userCF_reco  evtCF_reco  svdCF_reco  user_reco  evt_p_reco  \\\n",
       "0        0     0.268282    0.268282    1.000000        0.0         0.0   \n",
       "1        0     0.268282    0.268282    1.000000        0.0         0.0   \n",
       "2        0     0.268282    0.268282    1.000000        0.0         0.0   \n",
       "3        0     0.268282    0.268282    1.000000        0.0         0.0   \n",
       "4        0     0.268282    0.268282    0.881089        0.0         0.0   \n",
       "5        0     0.268282    0.268282    1.000000        0.0         0.0   \n",
       "6        0     0.268282    0.268282    1.000000        0.0         0.0   \n",
       "7        0     0.268282    0.268282    0.860794        0.0         0.0   \n",
       "8        0     0.268282    0.268282    1.000000        0.0         0.0   \n",
       "\n",
       "   evt_c_reco  user_pop  frnd_infl   evt_pop  \n",
       "0         0.0  0.000118        0.0  0.000124  \n",
       "1         0.0  0.000118        0.0  0.000002  \n",
       "2         0.0  0.000118        0.0 -0.000002  \n",
       "3         0.0  0.000118        0.0  0.000035  \n",
       "4         0.0  0.000118        0.0 -0.000030  \n",
       "5         0.0  0.000118        0.0  0.000253  \n",
       "6         0.0  0.000118        0.0 -0.000014  \n",
       "7         0.0  0.000009        0.0  0.000037  \n",
       "8         0.0  0.000009        0.0  0.000028  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rs_test = pd.read_csv(\"RS_test.csv\")\n",
    "rs_test.head(9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
